I'm training XGBoost on the speed dating dataset. Model performance is very high:
I've chosen two quite different observations, both belonging to the same class:
predictions are:
Python: dalex and shap
I have chosen two observations randomly just to see the differences between both packages (this is not for subtask 4).
Dalex and Shap respectively - the order of variables changes slightly:
same for the second observation - but here the most influential variable also changes - the reason is that dalex estimates Shapley values (in Dalex, predict_parts returns number-of-variables * 25 rows, which are sorts of estimates and are then aggregated):
The number of unique pairs of observations which differ at the two top positions of variables ordered by absolute Shapley value is 24081 for the SHAP package and 25827 for DALEX. This is reasonable, as DALEX approximates Shapley values, which injects additional randomness into the variable ordering. Calculations for DALEX have been obtained by aggregating results from the predict_parts method.
First pair of observations which differ in the top two variables by absolute Shapley value - results from the Shap package:
Same pair of observations plotted with Dalex - we can see that the two first variables from obs 601 are again much below the two top places in obs 1742; unfortunately, the top places between Dalex and Shap change for each observation:
I've iterated over all possible pairs of observations and found that these two observations are most different from each other in terms of the absolute difference of Shapley values for the shared_interests_o variable (Shap package):
The same task solved with Dalex. Shapley values for the variable shared_interests_o have been calculated for all observations in the test dataset. The same pair of observations has been found - so the differences in Shapley values between the packages are not so huge.
Done above.
I chose a logistic regression model. The data had to be scaled in advance to assure convergence, which is an additional complication in the pipeline, as preprocessing.StandardScaler returns a numpy array, and we need a pandas DataFrame for the XAI methods (variable names).
I've searched for the observations in the test dataset whose Shapley value profiles differ most between xgboost and logistic regression; the obtained difference is huge because different classes have been predicted by each model:
We have six permutations:
(A, B, C): Contribution of A is v(A) - v(empty set) = 20 - 0 = 20
(A, C, B): Contribution of A is v(A) - v(empty set) = 20 - 0 = 20
(B, A, C): Contribution of A is v(A, B) - v(B) = 60 - 20 = 40
(C, A, B): Contribution of A is v(A, C) - v(C) = 70 - 60 = 10
(B, C, A): Contribution of A is v(A, B, C) - v(B, C) = 100 - 70 = 30
(C, B, A): Contribution of A is v(A, B, C) - v(B, C) = 100 - 70 = 30
The average contribution of A during building the coalition is (20 + 20 + 40 + 10 + 30 + 30) / 6 = 25, which is the Shapley value of player (variable) A.
SEED = 42
import numpy as np
import pandas as pd
from os.path import exists

# Load the SpeedDating dataset, preferring a local cached copy over the
# GitHub raw download.
repo_url = "https://raw.githubusercontent.com/adrianstando/imbalanced-benchmarking-set/main/datasets/"
dataset = "SpeedDating.csv" # ♥
if exists(dataset):
    # Local cache hit - avoid re-downloading.
    df = pd.read_csv(dataset, index_col=0)
else:
    try:
        df = pd.read_csv(repo_url + dataset, index_col=0)
        df.to_csv(dataset)  # cache for subsequent runs
    except OSError as e:
        # Narrow except instead of a bare `except:` that would also swallow
        # KeyboardInterrupt and programming errors; report what failed.
        print(f"Failed to download {dataset}: {e}")
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# Separate the target column from the features and hold out a third of the
# data for evaluation, reproducibly via the global SEED.
y = df['TARGET']
X = df.drop(columns=['TARGET'])
# Only the first 10 feature columns are used for modelling.
feature_subset = X.iloc[:, :10]
X_train, X_test, y_train, y_test = train_test_split(
    feature_subset, y, test_size=0.33, random_state=SEED
)
# 1. Train a tree-based ensemble model on the selected dataset; it can be one of random forest,
# GBM, CatBoost, XGBoost, LightGBM (various types) etc.
import xgboost as xgb
print(xgb.__version__)
# Binary classifier: fairly deep trees (max_depth=10), moderate learning rate,
# 100 boosting rounds; class weighting deliberately left out (see notes below).
xgb_clf = xgb.XGBClassifier(
objective='binary:logistic',
max_depth=10,
learning_rate=0.1,
n_estimators=100,
# early_stopping_rounds=3 # can't use directly in CV setting with train-test split only (val dataset would be needed)
# scale_pos_weight=df.TARGET.mean() # using 'scale_pos_weight' this way would be data leakage
)
xgb_clf.fit(X_train, y_train)
y_pred = xgb_clf.predict(X_test)
# Report accuracy next to the majority-class baseline (1 - positive rate).
print(f"{accuracy_score(y_test, y_pred)=:.5f}\n{1-np.mean(y_test)=:.5f}")
confusion_matrix(y_test, y_pred)
X_test.index
# 2. Select two observations from the dataset and calculate the model's prediction.
obs = [601, 53]
# Hard class predictions for the two chosen rows.
xgb_clf.predict(X_test.loc[obs, :])
# Class-membership probabilities, shown as a small labelled table.
pred = xgb_clf.predict_proba(X_test.loc[obs, :])
pd.DataFrame(
    pred,
    columns=["prob_cls_0", "prob_cls_1"],
    index=[f"obs {o}" for o in obs],
)
# Transpose so each observation is a column and features are rows.
X_test.loc[obs, :].T
# Import the two explanation libraries, installing them on the fly when
# missing. The `!pip` lines are IPython shell magics - this file is a
# notebook export and is not plain importable Python.
try:
import shap
except:
!pip install -q shap
import shap
try:
import dalex as dx
except:
!pip install -q dalex
import dalex as dx
from tqdm.notebook import tqdm
# 3./4. Explain the two chosen observations with both packages:
# dalex break-down-style SHAP plots and shap waterfall plots.
dx_explainer = dx.Explainer(xgb_clf, X_train, y_train)
dx_explainer.model_performance()
# First observation - dalex explanation.
i = obs[0]
dx_shap_values = dx_explainer.predict_parts(type="shap", label=f'observation {i}',
new_observation=X_test.loc[i,:])
dx_shap_values.plot()
# Same observation with the shap package; background data + probability
# output make the values comparable with dalex's.
shap_explainer = shap.TreeExplainer(xgb_clf, data=X_train, model_output="probability")
shap_values = shap_explainer(X_test.loc[[obs[0]],:]) # shap_values # = shap_values[:, :, 1]
print('xgboost obs 601')
shap.waterfall_plot(shap_values[0])
# Second observation - same pair of plots.
i = obs[1]
dx_shap_values = dx_explainer.predict_parts(type="shap", label=f'observation {i}',
new_observation=X_test.loc[i,:])
dx_shap_values.plot()
shap_explainer = shap.TreeExplainer(xgb_clf, data=X_train, model_output="probability")
shap_values = shap_explainer(X_test.loc[[obs[1]],:]) # shap_values # = shap_values[:, :, 1]
print('xgboost obs 53')
shap.waterfall_plot(shap_values[0])
shap_values.values
# for 'shap' package
# SHAP values for every test observation, as one features-wide DataFrame.
explainer = shap.Explainer(xgb_clf)
shap_values = explainer.shap_values(X_test)
shap_df = pd.DataFrame(shap_values, columns=X_test.columns, index=X_test.index)
shap_df_ = shap_df.copy()  # pristine numeric-only copy kept for later comparisons
# Rank features per row by |SHAP| in a single vectorized argsort pass instead
# of the original per-row `apply` with `sort_values` (one sort per row).
_abs_rank = np.argsort(-shap_df.abs().to_numpy(), axis=1)
shap_df['second_max_shap_feature'] = shap_df_.columns[_abs_rank[:, 1]]
shap_df['max_shap_feature'] = shap_df_.columns[_abs_rank[:, 0]]
shap_df.iloc[:, -5:]
# O(n^2) operations over observation pairs, but hoist the per-row top-2
# feature sets out of the nested loop: the original rebuilt each set via
# DataFrame .iloc lookups on every inner iteration, which dominated runtime.
top2_sets = [
    frozenset((a, b))
    for a, b in zip(shap_df['max_shap_feature'], shap_df['second_max_shap_feature'])
]
obs_different_max_shaps = []
for i in range(len(top2_sets)):
    for j in range(i + 1, len(top2_sets)):
        # A pair qualifies when the two observations share no top-2 feature.
        if top2_sets[i].isdisjoint(top2_sets[j]):
            obs_different_max_shaps.append((i, j))
f"for SHAP number of different pairs of obseration with two different variables with max abs shap value is {len(obs_different_max_shaps)}"
# Take the first qualifying pair and plot both observations' explanations.
obs1, obs2 = obs_different_max_shaps[0]
obs1 = X_test.index[obs1]  # map positional indices back to DataFrame labels
obs2 = X_test.index[obs2]
shap_values = shap_explainer(X_test.loc[[obs1],:])
print(f'xgboost, two very different explanations, observation {obs1=}"')
shap.waterfall_plot(shap_values[0])
shap_values = shap_explainer(X_test.loc[[obs2],:])
print(f'xgboost, two very different explanations, observation {obs2=}"')
shap.waterfall_plot(shap_values[0])
X_test.columns
# lets choose to most distinct such observation over 'shared_interests_o' variable
# Observations with the smallest and largest SHAP value for 'shared_interests_o'.
obs_min_shap = shap_df['shared_interests_o'].idxmin()
obs_max_shap = shap_df['shared_interests_o'].idxmax()
shap_values = shap_explainer(X_test.loc[[obs_min_shap],:]) # shap_values # = shap_values[:, :, 1]
print(f'xgboost, min obs={obs_min_shap} val of "shared_interests_o"')
shap.waterfall_plot(shap_values[0])
shap_values = shap_explainer(X_test.loc[[obs_max_shap],:]) # shap_values # = shap_values[:, :, 1]
print(f'xgboost, max obs={obs_max_shap} val of "shared_interests_o"')
shap.waterfall_plot(shap_values[0])
# Fresh dalex explainer for the xgboost model; `.predict(...).shape` is a
# quick sanity check that predictions cover the whole test set.
dx_explainer = dx.Explainer(xgb_clf, X_train, y_train)
dx_explainer.predict(X_test).shape
# 6. (How) Do the results differ across the two packages selected in point (3) ?
# O(n^2)-ish overall: one dalex predict_parts call per test observation.
# dalex's `shap`-type predict_parts returns several sampled orderings per
# variable, so contributions must be aggregated (mean per variable) into a
# single SHAP-like row per observation.
shap_values_list = []
for index, row in tqdm(X_test.iterrows(), total=len(X_test)):
    parts = dx_explainer.predict_parts(new_observation=row.to_frame().T, type='shap')
    # Mean contribution per variable; `.mean()` on the grouped column replaces
    # the deprecated `groupby(...).apply("mean")` of the original.
    mean_contrib = parts.result.groupby("variable_name")["contribution"].mean()
    mean_contrib.name = index  # label the row with the observation's index
    shap_values_list.append(mean_contrib.to_frame().T)
shap_df_dx = pd.concat(shap_values_list)
shap_df_dx
shap_df_dx_ = shap_df_dx.copy()  # pristine numeric-only copy for later comparisons
# Rank features per row by |SHAP| in one vectorized argsort pass instead of
# the original per-row `apply` with `sort_values`.
_abs_rank_dx = np.argsort(-shap_df_dx.abs().to_numpy(), axis=1)
shap_df_dx['second_max_shap_feature'] = shap_df_dx_.columns[_abs_rank_dx[:, 1]]
shap_df_dx['max_shap_feature'] = shap_df_dx_.columns[_abs_rank_dx[:, 0]]
shap_df_dx.iloc[:, -5:]
# Same O(n^2) pair scan for the dalex-derived SHAP values; per-row top-2
# feature sets are precomputed once instead of rebuilt via .iloc in the loop.
top2_sets_dx = [
    frozenset((a, b))
    for a, b in zip(shap_df_dx['max_shap_feature'], shap_df_dx['second_max_shap_feature'])
]
obs_different_max_shaps = []
for i in range(len(top2_sets_dx)):
    for j in range(i + 1, len(top2_sets_dx)):
        # A pair qualifies when the two observations share no top-2 feature.
        if top2_sets_dx[i].isdisjoint(top2_sets_dx[j]):
            obs_different_max_shaps.append((i, j))
f'DALEX number of different pairs of obseration with two different variables with max abs shap value is {len(obs_different_max_shaps)}'
# First qualifying pair from the dalex scan, plotted with dalex.
obs1, obs2 = obs_different_max_shaps[0]
obs1 = X_test.index[obs1]  # map positional indices back to DataFrame labels
obs2 = X_test.index[obs2]
# shap_values = shap_explainer(X_test.loc[[obs1],:])
# print(f'xgboost, two very different explanations, observation {obs1=}"')
# shap.waterfall_plot(shap_values[0])
i = obs1
dx_shap_values = dx_explainer.predict_parts(type="shap", label=f'observation {i}',
new_observation=X_test.loc[i,:])
dx_shap_values.plot()
i = obs2
dx_shap_values = dx_explainer.predict_parts(type="shap", label=f'observation {i}',
new_observation=X_test.loc[i,:])
dx_shap_values.plot()
# Observations with the smallest / largest dalex SHAP value for
# 'shared_interests_o', each plotted with dalex.
obs_min_shap = shap_df_dx['shared_interests_o'].idxmin()
obs_max_shap = shap_df_dx['shared_interests_o'].idxmax()
i=obs_min_shap
dx_shap_values = dx_explainer.predict_parts(type="shap", label=f'observation {i}',
new_observation=X_test.loc[i,:])
dx_shap_values.plot()
i=obs_max_shap
dx_shap_values = dx_explainer.predict_parts(type="shap", label=f'observation {i}',
new_observation=X_test.loc[i,:])
dx_shap_values.plot()
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression

# 7. A second, different model: logistic regression. It needs standardized
# features to converge; the scaler is fit on the training split only.
scaler = preprocessing.StandardScaler().fit(X_train)
# StandardScaler.transform returns a bare numpy array, but the XAI tooling
# needs a DataFrame (for variable names) - wrap the result back with the
# original columns/index. X_train/X_test are already DataFrames, so the
# original's extra pd.DataFrame(...) wrapping before transform was redundant.
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train), columns=X_train.columns, index=X_train.index
)
LR = LogisticRegression(random_state=0).fit(X_train_scaled, y_train)
LR.score(scaler.transform(X_test), y_test)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)
X_test_scaled
# Compare SHAP explanations of the logistic regression against xgboost's
# (stored earlier in shap_df_) and find the most differently-explained obs.
# BUG FIX: the explainer must wrap the logistic-regression model (LR) on the
# scaled data - the original wrapped xgb_clf here, so the "logistic
# regression" SHAP values were actually xgboost's, evaluated on scaled inputs.
explainer = shap.Explainer(LR, X_train_scaled)
shap_values = explainer.shap_values(X_test_scaled)
shap_df = pd.DataFrame(shap_values,
                       columns=X_test_scaled.columns,
                       index=X_test_scaled.index
                       )
# Both models' SHAP frames must cover the same features in the same order.
assert np.all(shap_df.columns == shap_df_.columns)
# Observation with the largest mean |SHAP| difference between the two models.
max_diff_obs = (shap_df_ - shap_df).abs().mean(axis=1).idxmax()
max_diff_obs
# Waterfall plots for the most differently-explained observation, once per model.
shap_explainer = shap.TreeExplainer(xgb_clf, data=X_train, model_output="probability")
shap_values = shap_explainer(X_test.loc[[max_diff_obs], :])
print(f'xgboost, {max_diff_obs=}')
shap.waterfall_plot(shap_values[0])
# BUG FIX: explain the logistic regression with its own (linear) explainer on
# the scaled data - the original reused the xgboost TreeExplainer here while
# labelling the plot "logistic regression".
lr_explainer = shap.Explainer(LR, X_train_scaled)
shap_values = lr_explainer(X_test_scaled.loc[[max_diff_obs], :])
print(f'logistic regression, {max_diff_obs=}')
shap.waterfall_plot(shap_values[0])
# 8. Comment on the results obtained in points (4)-(7)
# NOTE(review): `!cd` is an IPython shell magic run in a subshell, so it
# changes nothing persistently - presumably a leftover from debugging.
!cd
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
import numpy as np
# List for storing the random forest models
random_forests = []
i = SEED
N = 500
# One forest of N bootstrapped trees; individual trees are explained below.
clf = RandomForestClassifier(n_estimators=N, bootstrap=True, random_state=i)
clf.fit(X_train, y_train)
assert len(clf.estimators_) == N
# one estimator
# SHAP waterfall for a single tree of the forest on the first chosen
# observation; [0,:,0] selects the first row and the first model output.
shap_explainer = shap.TreeExplainer(clf.estimators_[0], data=X_train, model_output="probability")
shap_values = shap_explainer(X_test.loc[[obs[0]],:])
shap.waterfall_plot(shap_values[0,:,0])
from tqdm import trange
# Per-tree SHAP matrices over the whole test set; `[0]` takes the first
# element of shap_values (per-class list for sklearn classifiers - presumably
# class 0; confirm against the shap version in use).
shap_arrays = []
for i in trange(N, desc='Trees from random forest'):
shap_explainer_clf = shap.TreeExplainer(clf.estimators_[i], data=X_train, model_output="probability")
shap_values_clf = shap_explainer_clf.shap_values(X_test)[0]
shap_arrays.append(shap_values_clf)
# Pairwise "distance": sqrt of the mean absolute difference of SHAP matrices.
# NOTE(review): the loops run over k = number of test observations, but they
# index shap_arrays, which holds one entry per TREE (N of them) - the intent
# was presumably range(N); confirm whether k or N was meant here.
k = X_test.shape[0]
distance_matrix = np.zeros((k, k))
for i in trange(k, desc='Trees from random forest'):
for j in range(i, k):
diff = np.abs(shap_arrays[i] - shap_arrays[j]).mean()
distance_matrix[i,j] = np.sqrt(diff)
# Mirror the upper triangle to make the matrix symmetric (diagonal stays 0).
distance_matrix = distance_matrix + distance_matrix.T
distance_matrix.shape
import matplotlib.pyplot as plt

# BUG FIX: `order` was inspected before it was defined (NameError in the
# original cell order); define it first, then inspect the shapes.
# Sort rows/columns by total distance so the heatmap exposes structure.
order = np.argsort(distance_matrix.sum(axis=1))
order.shape, distance_matrix.shape, type(distance_matrix)
plt.imshow(distance_matrix[order, :][:, order])
plt.show()
# shap_df_clf = pd.DataFrame(shap_values_clf, columns=X_test.columns, index=X_test.index)
# shap_df_clf.shape
# shap_values = shap_explainer(X_test)
# shap_values.array.shape
# shap.waterfall_plot(shap_values[0,:,0])
# NOTE(review): `explainer` here is whichever shap.Explainer was created last
# earlier in the file - confirm it wraps the intended model before relying
# on these values.
shap_values = explainer.shap_values(X_test)
shap_df = pd.DataFrame(shap_values, columns=X_test.columns, index=X_test.index)
shap_values[0]
from tqdm import trange
# Train N single-tree "forests", each with a different random_state, so every
# tree sees a different bootstrap sample.
N = 500
for i in trange(N):
clf = RandomForestClassifier(n_estimators=1, bootstrap=True, random_state=i)
clf.fit(X_train, y_train)
random_forests.append(clf)
# SHAP matrices for the first 10 single-tree models; `[0]` takes the first
# element of shap_values (per-class list for sklearn classifiers).
shap_arrays = []
for i in trange(10, desc='Trees from random forest'):
shap_explainer_clf = shap.TreeExplainer(random_forests[i], data=X_train, model_output="probability")
shap_values_clf = shap_explainer_clf.shap_values(X_test)[0]
shap_arrays.append(shap_values_clf)
# Element-wise difference between the first two trees' SHAP matrices.
shap_arrays[0] - shap_arrays[1]
# !pip install catboost lightgbm
# Compare global SHAP feature importance across three gradient-boosting
# implementations trained with default settings.
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
import shap
import matplotlib.pyplot as plt
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train, y_train)
lgb_model = lgb.LGBMClassifier()
lgb_model.fit(X_train, y_train)
cb_model = cb.CatBoostClassifier(verbose=0)
cb_model.fit(X_train, y_train)
# TreeExplainers; note the CatBoost one is built without background data here.
xgb_explainer = shap.TreeExplainer(xgb_model, X_train)
lgb_explainer = shap.TreeExplainer(lgb_model, X_train)
cb_explainer = shap.TreeExplainer(cb_model)
xgb_shap_values = xgb_explainer.shap_values(X_test)
lgb_shap_values = lgb_explainer.shap_values(X_test)
cb_shap_values = cb_explainer.shap_values(X_test)
sample_index = 0 # index of the sample for which to draw SHAP values (NOTE(review): unused below)
print('XGBoost SHAP Values')
shap.summary_plot(xgb_shap_values, X_test, plot_type='bar')
print('lightGbm SHAP Values')
shap.summary_plot(lgb_shap_values, X_test, plot_type='bar')
print('CatBoost SHAP Values')
shap.summary_plot(cb_shap_values, X_test, plot_type='bar')